#title: "Project - Part I"
#student: Kaimi Huang (kh2908)
#objective of research: the objective of the research is to identify factors that relate to high systolic and diastolic blood pressures
library(foreign)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.2.0 ✓ forcats 0.5.1
## ✓ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
#use the foreign package to convert SAS xpt files downloaded from CDC's website to csv files
#give each file detailed names to better identify the datasets
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DEMO.XPT")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DEMO.csv")
Demographic_Variables_and_Sample_Weights <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DEMO.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DSQTOT.XPT")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DSQTOT.csv")
Dietary_Supplement_Use <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_DSQTOT.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BMX.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BMX.csv")
Body_Measures <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BMX.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPXO.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPXO.csv")
Blood_Pressure_Measures <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPXO.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPQ.csv")
Blood_Pressure_Cholesterol_Questionnaire <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_BPQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ALQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ALQ.csv")
Alcohol_Use <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ALQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ECQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ECQ.csv")
Early_Childhood <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_ECQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_HIQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_HIQ.csv")
Health_Insurance <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_HIQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_INQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_INQ.csv")
Income <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_INQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_SLQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_SLQ.csv")
Sleep_Disorders <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_SLQ.csv')
data <- read.xport("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_OCQ.xpt")
write.csv(data, file="/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_OCQ.csv")
Occupation <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//P_OCQ.csv')
#import two files containing the detailed names for all datasets' columns
column_names_1 <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//Variables in the Total Dietary Supplement File.csv')
column_names_2 <- read.csv('/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project//Demo, body measures, blood pressure, etc.csv')
#combine the variable labels and detailed variable names from the two files by first renaming the first two columns in the files. and then merge the data into one data frame called column_names
colnames(column_names_1)[1] <- "variable_labels"
colnames(column_names_1)[2] <- "variable_names"
colnames(column_names_2)[1] <- "variable_labels"
colnames(column_names_2)[2] <- "variable_names"
column_names <- merge(column_names_1, column_names_2, all=TRUE)
class(column_names)
## [1] "data.frame"
dim(column_names)
## [1] 1258 8
#currently, all the datasets contain cryptic column names. in order to better identify data in the datasets, a nested for loop is used to replace the current column labels with detailed column names
#first, create a list to house all the datasets
dataset_list <- list("Demographic_Variables_and_Sample_Weights" = Demographic_Variables_and_Sample_Weights,
"Dietary_Supplement_Use" = Dietary_Supplement_Use,
"Body_Measures" = Body_Measures,
"Blood_Pressure_Measures" = Blood_Pressure_Measures,
"Blood_Pressure_Cholesterol_Questionnaire" = Blood_Pressure_Cholesterol_Questionnaire,
"Alcohol_Use" = Alcohol_Use,
"Early_Childhood" = Early_Childhood,
"Health_Insurance" = Health_Insurance,
"Income" = Income,
"Sleep_Disorders" = Sleep_Disorders,
"Occupation" = Occupation)
#for every dataset in the dataset_list
for (k in seq_along(dataset_list)){
#a placeholder vector called old_colnames takes its existing column labels
old_colnames <- colnames(dataset_list[[k]])
#every column label in old_colnames
for (i in seq_along(old_colnames)){
#is compared to all the labels in the column_names data frame
for (j in seq_along(column_names$variable_labels)){
#if the column label in old_colnames matches any label in column_names's variable_labels, then the column in the dataset will be renamed by the corresponding variable name from column_names's variable_names
if (old_colnames[i] == column_names$variable_labels[j]){
colnames(dataset_list[[k]])[i] <- column_names$variable_names[j]
}
}
}
}
#the dependent variables are systolic and diastolic blood pressure. since there are three sets of measures in the Blood_Pressure_Measures dataset, it is necessary to check if they are drastically different before deciding which set of observations should be used.
#before a boxplot is used to compare the blood pressure measure sets, data in the original Blood_Pressure_Measures data frame needs to be manipulated so that all the measures for a particular type is in one column and a categorical variable is created to indicate if a measure is the 1st, 2nd, or 3rd reading.
#one data frame for each set of measures is created
Blood_Pressure_Measures_1 <- dataset_list[["Blood_Pressure_Measures"]][, c('Respondent sequence number.', 'Systolic - 1st oscillometric reading', 'Diastolic - 1st oscillometric reading')]
Blood_Pressure_Measures_2 <- dataset_list[["Blood_Pressure_Measures"]][, c('Respondent sequence number.', 'Systolic - 2nd oscillometric reading', 'Diastolic - 2nd oscillometric reading')]
Blood_Pressure_Measures_3 <- dataset_list[["Blood_Pressure_Measures"]][, c('Respondent sequence number.', 'Systolic - 3rd oscillometric reading', 'Diastolic - 3rd oscillometric reading')]
#then, a categorical variable is appended to the data frames to identify the number of reading.
Blood_Pressure_Measures_1[,"Reading"] <- rep('1st reading',length(Blood_Pressure_Measures_1$`Respondent sequence number.`))
Blood_Pressure_Measures_2[,"Reading"] <- rep('2nd reading',length(Blood_Pressure_Measures_2$`Respondent sequence number.`))
Blood_Pressure_Measures_3[,"Reading"] <- rep('3rd reading',length(Blood_Pressure_Measures_3$`Respondent sequence number.`))
#columns 2 & 3 are renamed using the same column names so the merge function can combine the three sets of data
colnames(Blood_Pressure_Measures_1)[2:3] <- c('Systolic','Diastolic')
colnames(Blood_Pressure_Measures_2)[2:3] <- c('Systolic','Diastolic')
colnames(Blood_Pressure_Measures_3)[2:3] <- c('Systolic','Diastolic')
#lastly, the merge function is used to combine all the data
Blood_Pressure_Measures <- merge(Blood_Pressure_Measures_1,Blood_Pressure_Measures_2,all = TRUE)
Blood_Pressure_Measures <- merge(Blood_Pressure_Measures_3,Blood_Pressure_Measures,all = TRUE)
#histograms, boxplots, and summary statistics are used to determine the proper set of readings to use
ggplot(Blood_Pressure_Measures_1, aes(Systolic)) +
geom_histogram(binwidth = 0.10, colour = "blue", bins = 10, na.rm = T) +
labs(title = "Systolic - First Reading")

ggplot(Blood_Pressure_Measures_2, aes(Systolic)) +
geom_histogram(binwidth = 0.10, colour = "green", bins = 10, na.rm = T) +
labs(title = "Systolic - Second Reading")

ggplot(Blood_Pressure_Measures_3, aes(Systolic)) +
geom_histogram(binwidth = 0.10, colour = "red", bins = 10, na.rm = T) +
labs(title = "Systolic - Third Reading")

ggplot(Blood_Pressure_Measures_1, aes(Diastolic)) +
geom_histogram(binwidth = 0.10, colour = "blue", bins = 10, na.rm = T) +
labs(title = "Diastolic - First Reading")

ggplot(Blood_Pressure_Measures_2, aes(Diastolic)) +
geom_histogram(binwidth = 0.10, colour = "green", bins = 10, na.rm = T) +
labs(title = "Diastolic - Second Reading")

ggplot(Blood_Pressure_Measures_3, aes(Diastolic)) +
geom_histogram(binwidth = 0.10, colour = "red", bins = 10, na.rm = T) +
labs(title = "Diastolic - Third Reading")

boxplot(Blood_Pressure_Measures$Systolic ~ Blood_Pressure_Measures$Reading)

boxplot(Blood_Pressure_Measures$Diastolic ~ Blood_Pressure_Measures$Reading)

pressure_measure_summaries <- list(
systolic = list(first = summary(Blood_Pressure_Measures_1$Systolic),
second = summary(Blood_Pressure_Measures_2$Systolic),
third = summary(Blood_Pressure_Measures_3$Systolic)),
diastolic = list(first = summary(Blood_Pressure_Measures_1$Diastolic),
second = summary(Blood_Pressure_Measures_2$Diastolic),
third = summary(Blood_Pressure_Measures_3$Diastolic))
)
print(pressure_measure_summaries)
## $systolic
## $systolic$first
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52 106 117 120 130 225 1304
##
## $systolic$second
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 54.0 106.0 116.0 119.7 130.0 222.0 1329
##
## $systolic$third
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 55.0 106.0 117.0 119.7 130.0 220.0 1370
##
##
## $diastolic
## $diastolic$first
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 63.00 71.00 72.04 80.00 151.00 1304
##
## $diastolic$second
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 28.0 63.0 71.0 71.5 79.0 146.0 1329
##
## $diastolic$third
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 30.00 63.00 70.00 71.26 79.00 145.00 1370
#based on the graphs and the statistics, the variances among the three readings are negligible. hence, the first readings will be used as the dependent variable
#the following codes create a scatter plot to verify that systolic and diastolic blood pressures are directly related
ggplot(data = Blood_Pressure_Measures) + geom_point(mapping = aes(x = Diastolic, y = Systolic, color = Reading), na.rm = TRUE)

#generate summary statistics for each variable
bpm_mean_sd <- Blood_Pressure_Measures %>%
select(Systolic, Diastolic, Reading) %>%
group_by(Reading) %>%
summarize(sys_mean = mean(Systolic, na.rm=TRUE),
sys_sd = sd(Systolic, na.rm = TRUE),
dia_mean = mean(Diastolic, na.rm = TRUE),
dia_sd = sd(Diastolic, na.rm = TRUE))
print(bpm_mean_sd)
## # A tibble: 3 × 5
## Reading sys_mean sys_sd dia_mean dia_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1st reading 120. 20.0 72.0 12.4
## 2 2nd reading 120. 19.8 71.5 12.4
## 3 3rd reading 120. 19.6 71.3 12.4
#according to the American College of Cardiology/American Heart Association Guideline for the Prevention, Detection, Evaluation, and Management of High Blood Pressure in Adults published in 2017, each type of blood pressures is categorized into three levels: normal(systolic: less than 120 mm Hg, diastolic: less than 80 mm Hg), elevated/at risk (systolic: 120–129 mm Hg, diastolic: less than 80 mm Hg), and high blood pressure (systolic: 130 mm Hg or higher, diastolic: 80 mm Hg or higher)
#it would be helpful to have sys_blood_pressure_level and dia_blood_pressure_level columns in the Blood_Pressure_Measures_1 data frame for proportion comparisons
#first, create two vectors for blood pressure levels. the vectors will be added to the Blood_Pressure_Measures_1 data frame later
sys_blood_pressure_level <- c()
dia_blood_pressure_level <- c()
#create a placeholder vector called category
category <- c()
#create a vector for level descriptions
blood_pressure_level_description <- c("normal", "elevated", "high")
#use for and if function to classify blood pressure levels for systolic
for (i in seq_along(Blood_Pressure_Measures_1$Systolic)){
if (is.na(Blood_Pressure_Measures_1$Systolic[i])){
category <- NA
}else if (Blood_Pressure_Measures_1$Systolic[i] >= 130){
category <- blood_pressure_level_description[3]
}else if (Blood_Pressure_Measures_1$Systolic[i] >= 120){
category <- blood_pressure_level_description[2]
}else{
category <- blood_pressure_level_description[1]
}
sys_blood_pressure_level <- c(sys_blood_pressure_level, category)
}
#use for and if function to classify blood pressure levels for diastolic
for (i in seq_along(Blood_Pressure_Measures_1$Diastolic)){
if (is.na(Blood_Pressure_Measures_1$Diastolic[i])){
category <- NA
}else if (Blood_Pressure_Measures_1$Diastolic[i] >= 80){
category <- blood_pressure_level_description[3]
}else{
category <- blood_pressure_level_description[1]
}
dia_blood_pressure_level <- c(dia_blood_pressure_level, category)
}
#append the two new vectors to the Blood_Pressure_Measures_1 data frame
Blood_Pressure_Measures_1$sys_blood_pressure_level <- sys_blood_pressure_level
Blood_Pressure_Measures_1$dia_blood_pressure_level <- dia_blood_pressure_level
head(Blood_Pressure_Measures_1)
## Respondent sequence number. Systolic Diastolic Reading
## 1 109264 109 67 1st reading
## 2 109266 99 56 1st reading
## 3 109270 123 73 1st reading
## 4 109271 102 65 1st reading
## 5 109273 116 68 1st reading
## 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level
## 1 normal normal
## 2 normal normal
## 3 elevated normal
## 4 normal normal
## 5 normal normal
## 6 high normal
#the first variable to explore is gender. in general, do men have a higher or lower blood pressure level than women?
#extract the identifier and gender columns from Demographic_Variables_and_Sample_Weights
gender <- data.frame('Respondent sequence number.' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Respondent sequence number.`, 'gender' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Gender of the participant.`)
#merge blood pressure measures and gender data frames
gender <- merge(Blood_Pressure_Measures_1, gender, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#filter for male rows and add a gender description column
male <- gender[gender$gender == 1,]
male$gender_description <- rep("male", length(male$gender))
#do the same for female rows
female <- gender[gender$gender == 2,]
female$gender_description <- rep("female", length(female$gender))
#combine the male and female data frames
gender <- merge(male, female, all = TRUE)
#use magrittr functions to filter for male and systolic, male and diastolic, female and systolic, and female and diastolic data and then create histograms using the data frames
male_systolic <- male %>%
select(`Respondent sequence number.`, Systolic, gender_description)
male_diastolic <- male %>%
select(`Respondent sequence number.`, Diastolic, gender_description)
female_systolic <- female %>%
select(`Respondent sequence number.`, Systolic, gender_description)
female_diastolic <- female %>%
select(`Respondent sequence number.`, Diastolic, gender_description)
ggplot(male_systolic, aes(Systolic)) +
geom_histogram(binwidth = 5, colour = "white", fill = "navy", na.rm = TRUE) +
labs(title="Male Systolic Blood Pressue")

ggplot(male_diastolic, aes(Diastolic)) +
geom_histogram(binwidth = 5, colour = "white", fill = "navy", na.rm = TRUE) +
labs(title="Male Diastolic Blood Pressue")

ggplot(female_systolic, aes(Systolic)) +
geom_histogram(binwidth = 5, colour = "white", fill = "red", na.rm = TRUE) +
labs(title="Female Systolic Blood Pressue")

ggplot(female_diastolic, aes(Diastolic)) +
geom_histogram(binwidth = 5, colour = "white", fill = "red", na.rm = TRUE) +
labs(title="Female Diastolic Blood Pressue")

#boxplots and summary statistics are used to explore the relationship between gender and blood pressure
boxplot(gender$Systolic ~ gender$gender_description)

boxplot(gender$Diastolic ~ gender$gender_description)

#create a vector to house the variable's descriptions. this will be useful for creating summary statistics
description <- c("male", "female")
#create a function to calculate the systolic blood pressure summary statistics of any kind of categorical variable
#the first argument is a data frame. the second parameter argument is an item of a categorical variable. for instance, male of the gender_description categorical variable can be a parameter
systolic_summary_stat <- function(dataframe, parameter){
#filter the 8th column of the specified data frame (will have to make sure the categorical description column for the coming data frames is the 8th column) by the parameter. a new data frame is created to house the filtered data
new_data_frame <- dataframe[dataframe[[8]] == parameter,]
#summary function is used to calculate the systolic statistics for the specified category
systolic <- summary(new_data_frame[["Systolic"]])
return (systolic)
}
#create a second function to calculate the diastolic blood pressure summary statistics of any kind of categorical variable
diastolic_summary_stat <- function(dataframe, parameter){
new_data_frame <- dataframe[dataframe[[8]] == parameter,]
diastolic <- summary(new_data_frame[["Diastolic"]])
return (diastolic)
}
#create a function to put all the systolic and diastolic summary statistics in one list. this function will be useful later when calculating the summary statistics for different variables
#the function takes two arguments: dataframe, where the data for calculation is from, and length_of_description_vector, which determines the number of summary statistics for each type of blood pressure. for instance, there are two items in the description for the gender variable. so, there will be two summary statistics for systolic blood pressure: one for male and the other for female
create_pressure_measure_summaries <- function(dataframe,length_of_description_vector){
#if the length of the description vector is 2, for instance, there are two groups in gender_description
if(length_of_description_vector == 2){
#then create a new list, which contains the systolic and diastolic lists. and each list contains the summary statistics for each categorical group
new_list <- list(
#for instance, in the systolic list, there are two summary statistics for gender: one for male and the other for female
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]))
)
print(new_list)
#next is the situation when the length of the description vector is not 2 but 3. and so on
} else if (length_of_description_vector == 3){
new_list <- list(
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = systolic_summary_stat(dataframe, description[3])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = diastolic_summary_stat(dataframe, description[3]))
)
print(new_list)
}else if (length_of_description_vector == 4){
new_list <- list(
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = systolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = systolic_summary_stat(dataframe, description[4])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = diastolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = diastolic_summary_stat(dataframe, description[4]))
)
print(new_list)
}else if (length_of_description_vector == 5){
new_list <- list(
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = systolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = systolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = systolic_summary_stat(dataframe, description[5])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = diastolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = diastolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = diastolic_summary_stat(dataframe, description[5]))
)
print(new_list)
}else if (length_of_description_vector == 6){
new_list <- list(
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = systolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = systolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = systolic_summary_stat(dataframe, description[5]),
name = description[6], summary_statistic = systolic_summary_stat(dataframe, description[6])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = diastolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = diastolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = diastolic_summary_stat(dataframe, description[5]),
name = description[6], summary_statistic = diastolic_summary_stat(dataframe, description[6]))
)
print(new_list)
}else if (length_of_description_vector == 7){
new_list <- list(
systolic = list(name = description[1], summary_statistic = systolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = systolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = systolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = systolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = systolic_summary_stat(dataframe, description[5]),
name = description[6], summary_statistic = systolic_summary_stat(dataframe, description[6]),
name = description[7], summary_statistic = systolic_summary_stat(dataframe, description[7])),
diastolic = list(name = description[1], summary_statistic = diastolic_summary_stat(dataframe, description[1]),
name = description[2], summary_statistic = diastolic_summary_stat(dataframe, description[2]),
name = description[3], summary_statistic = diastolic_summary_stat(dataframe, description[3]),
name = description[4], summary_statistic = diastolic_summary_stat(dataframe, description[4]),
name = description[5], summary_statistic = diastolic_summary_stat(dataframe, description[5]),
name = description[6], summary_statistic = diastolic_summary_stat(dataframe, description[6]),
name = description[7], summary_statistic = diastolic_summary_stat(dataframe, description[7]))
)
print(new_list)
}
}
pressure_measure_summaries_gender <- create_pressure_measure_summaries(gender, 2)
## $systolic
## $systolic$name
## [1] "male"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.0 110.0 120.0 122.5 132.0 220.0 584
##
## $systolic$name
## [1] "female"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 61.0 103.0 113.0 117.6 128.0 225.0 720
##
##
## $diastolic
## $diastolic$name
## [1] "male"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 64.00 72.00 72.48 80.00 151.00 584
##
## $diastolic$name
## [1] "female"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.0 63.0 70.0 71.6 79.0 137.0 720
#the second variable to explore is age. how is age related to blood pressure?
#extract the identifier and age columns from Demographic_Variables_and_Sample_Weights
age <- data.frame('Respondent sequence number.' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Respondent sequence number.`, 'age' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.`)
#merge the blood pressure measures and age data frames
age <- merge(Blood_Pressure_Measures_1, age, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot scatter plot to explore whether there is a relationship between age and blood pressure
library(ggplot2)
ggplot(age, aes(age, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(age, aes(age, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#create a placeholder character vector called category
category <- c()
#create a new vector called age_group, which will be added to the age data frame later
age_group <- c()
#use for and if functions to categorize participants' age
for (i in seq_along(age$age)){
if (age$age[i] >= 65){
category <- "65+"
}else if (age$age[i] >= 45){
category <- "middle age"
}else if (age$age[i] >= 18){
category <- "young adults"
}else if (age$age[i] >= 12){
category <- "teens"
}else if (age$age[i] >= 1){
category <- "children"
}else {
category <- "infants"
}
age_group <- c(age_group, category)
}
#add the age_group vector to the age data frame
age$age_group <- age_group
#create a vector of age groups in the desired order
description <- c("infants", "children", "teens", "young adults", "middle age", "65+")
#use the factor function to reorder the group levels of age within the data frame so the box plots will appear with an organized order
age$age_group <- factor(age$age_group, levels = description)
#boxplots and summary statistics are used to explore the relationship between age and blood pressure
boxplot(age$Systolic ~ age$age_group)

boxplot(age$Diastolic ~ age$age_group)

#use the function created above to calculate summary statistics
pressure_measure_summaries_age <- create_pressure_measure_summaries(age,length(description))
## $systolic
## $systolic$name
## [1] "infants"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
##
## $systolic$name
## [1] "children"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 61.0 94.0 101.0 100.8 107.0 161.0 196
##
## $systolic$name
## [1] "teens"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.0 101.0 108.0 108.1 114.0 159.0 167
##
## $systolic$name
## [1] "young adults"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 76 105 114 115 123 205 413
##
## $systolic$name
## [1] "middle age"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 78.0 114.0 125.0 127.4 138.0 220.0 263
##
## $systolic$name
## [1] "65+"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 66.0 121.0 134.0 136.1 150.0 225.0 265
##
##
## $diastolic
## $diastolic$name
## [1] "infants"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
##
## $diastolic$name
## [1] "children"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 55.00 59.00 60.11 65.00 124.00 196
##
## $diastolic$name
## [1] "teens"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 59.00 64.00 64.36 69.00 109.00 167
##
## $diastolic$name
## [1] "young adults"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.00 65.00 72.00 72.97 80.00 141.00 413
##
## $diastolic$name
## [1] "middle age"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.0 70.0 78.0 78.3 86.0 140.0 263
##
## $diastolic$name
## [1] "65+"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 65.00 72.00 72.87 80.00 151.00 265
#the third variable to explore is race. does a particular race have higher blood pressure than the rest?
#extract the identifier and race columns from Demographic_Variables_and_Sample_Weights
race <- data.frame('Respondent sequence number.' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Respondent sequence number.`, 'race' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Recode of reported race and Hispanic origin information, with Non-Hispanic Asian Category`)
#merge the blood pressure measures and race data frames
race <- merge(Blood_Pressure_Measures_1, race, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#current race column contains codes. codes will be replaced by detailed descriptions. first, create a vector containing the races in the corresponding code order
description <- c("Mexican American", "Other Hispanic", "White", "Black", "Asian", "Other Race")
#create a new vector for race description, which will be added to the race data frame later
race_description <- c()
#empty the placeholder vector called category
category <- c()
#use for and if function to match the codes in the race column with descriptions
for (i in seq_along(race$race)){
if (race$race[i] == 1){
category <- description[1]
}else if (race$race[i] == 2){
category <- description[2]
}else if (race$race[i] == 3){
category <- description[3]
}else if (race$race[i] == 4){
category <- description[4]
#note that there is no 5 in the race codes
}else if(race$race[i] == 6){
category <- description[5]
}else{
category <- description[6]
}
race_description <- c(race_description, category)
}
#add the race_description vector to the race data frame
race$race_description <- race_description
#boxplots and summary statistics are used to explore the relationship between age and blood pressure
boxplot(race$Systolic ~ race$race_description)

boxplot(race$Diastolic ~ race$race_description)

#use the function created above to calculate summary statistics
pressure_measure_summaries_race <- create_pressure_measure_summaries(race,length(description))
## $systolic
## $systolic$name
## [1] "Mexican American"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 72.0 104.0 113.0 116.1 125.0 217.0 210
##
## $systolic$name
## [1] "Other Hispanic"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 76.0 105.0 116.0 119.8 131.0 225.0 161
##
## $systolic$name
## [1] "White"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.0 107.0 117.0 119.9 130.0 219.0 318
##
## $systolic$name
## [1] "Black"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 75.0 107.0 119.0 123.3 135.5 224.0 346
##
## $systolic$name
## [1] "Asian"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 61.0 106.0 116.0 119.1 129.0 205.0 196
##
## $systolic$name
## [1] "Other Race"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 74.0 104.0 112.0 116.1 125.0 198.0 73
##
##
## $diastolic
## $diastolic$name
## [1] "Mexican American"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 61.00 68.00 69.72 77.00 124.00 210
##
## $diastolic$name
## [1] "Other Hispanic"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.00 63.00 71.00 71.62 79.00 131.00 161
##
## $diastolic$name
## [1] "White"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 63.00 71.00 71.32 79.00 141.00 318
##
## $diastolic$name
## [1] "Black"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 38.00 64.50 73.00 74.34 83.00 151.00 346
##
## $diastolic$name
## [1] "Asian"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.0 64.0 72.0 72.4 79.0 125.0 196
##
## $diastolic$name
## [1] "Other Race"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.00 62.00 69.00 70.67 79.00 140.00 73
#based on the boxplots, all races share similar distributions in both kinds of blood pressures. in order to explore the data further, we will create tables and bar charts to explore proportions based on race
#create bar charts to compare proportions
ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

#create a function to create a table with margin for systolic counts
table_sys <- function (dataframe, categorical_variable){
#the table is by categorical variables and by systolic blood levels and inlcudes NA values
table_sys <- table(dataframe[,categorical_variable],dataframe[,'sys_blood_pressure_level'], useNA = 'always')
#add column and row totals to the table
table_sys <- addmargins(table_sys)
print(table_sys)
}
#create the same function but for diastolic
table_dia <- function (dataframe, categorical_variable){
#the table is by categorical variables and by diastolic blood levels and inlcudes NA values
table_dia <- table(dataframe[,categorical_variable],dataframe[,'dia_blood_pressure_level'], useNA = 'always')
#add column and row totals to the table
table_dia <- addmargins(table_dia)
print(table_dia)
}
#use the functions created above to create tables based on race
table_sys_race <-table_sys(race, "race_description")
##
## elevated high normal <NA> Sum
## Asian 212 267 640 196 1315
## Black 449 910 1408 346 3113
## Mexican American 222 238 812 210 1482
## Other Hispanic 168 269 575 161 1173
## Other Race 94 120 414 73 701
## White 620 929 2005 318 3872
## <NA> 0 0 0 0 0
## Sum 1765 2733 5854 1304 11656
table_dia_race <-table_dia(race, "race_description")
##
## high normal <NA> Sum
## Asian 276 843 196 1315
## Black 891 1876 346 3113
## Mexican American 247 1025 210 1482
## Other Hispanic 242 770 161 1173
## Other Race 145 483 73 701
## White 848 2706 318 3872
## <NA> 0 0 0 0
## Sum 2649 7703 1304 11656
#the fourth variable to explore is education. how is the level of education related to blood pressure?
#extract the identifier and education columns from Demographic_Variables_and_Sample_Weights
education <- data.frame('Respondent sequence number.' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`Respondent sequence number.`, 'education' = dataset_list[["Demographic_Variables_and_Sample_Weights"]]$`What is the highest grade or level of school {you have/SP has} completed or the highest degree {you have/s/he has} received?`)
#merge the blood pressure measures and age data frames
education <- merge(Blood_Pressure_Measures_1, education, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#current education column contains codes. a column containing the detailed descriptions for the codes will be added. first, create a vector containing the education levels in the corresponding code order
description <- c("Less than 9th grade", "9-11th grade", "High school graduate", "Some college", "College graduate or above", "Refused", "Don't Know")
#create a new vector for education level, which will be added to the race data frame later
education_level <- c()
#empty the placeholder vector called category
category <- c()
#use for and if function to replace the codes in the race column with descriptions
for (i in seq_along(education$education)){
if (is.na(education$education[i])){
category <- NA
}else if (education$education[i] == 1){
category <- description[1]
}else if (education$education[i] == 2){
category <- description[2]
}else if (education$education[i] == 3){
category <- description[3]
}else if (education$education[i] == 4){
category <- description[4]
}else if(education$education[i] == 5){
category <- description[5]
#note that there is no 6 in the education codes
}else if(education$education[i] == 7){
category <- description[6]
}else{
category <- description[7]
}
education_level <- c(education_level, category)
}
#add the education_level vector to the education data frame
education$education_level <- education_level
#use the factor function to reorder the education levels within the data frame so the box plots will appear with an organized order. the order is the same as it appears in the description vector above
education$education_level <- factor(education$education_level, levels = description)
#boxplots and summary statistics are used to explore the relationship between education level and blood pressure
boxplot(education$Systolic ~ education$education_level)

boxplot(education$Diastolic ~ education$education_level)

#use the function created above to calculate summary statistics
pressure_measure_summaries_education <- create_pressure_measure_summaries(education,length(description))
## $systolic
## $systolic$name
## [1] "Less than 9th grade"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 81.0 115.0 127.0 131.4 145.0 213.0 3260
##
## $systolic$name
## [1] "9-11th grade"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 78.0 113.0 125.0 128.5 141.0 225.0 3226
##
## $systolic$name
## [1] "High school graduate"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 82.0 112.0 123.0 126.1 137.0 220.0 3314
##
## $systolic$name
## [1] "Some college"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 66.0 110.0 121.0 124.1 135.0 220.0 3350
##
## $systolic$name
## [1] "College graduate or above"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 80.0 110.0 120.0 122.1 131.0 199.0 3298
##
## $systolic$name
## [1] "Refused"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 114 114 114 114 114 114 3113
##
## $systolic$name
## [1] "Don't Know"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 112.0 123.2 128.5 130.8 136.0 154.0 3119
##
##
## $diastolic
## $diastolic$name
## [1] "Less than 9th grade"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.00 67.00 74.00 75.12 83.00 137.00 3260
##
## $diastolic$name
## [1] "9-11th grade"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 45.00 67.00 74.00 75.14 82.00 141.00 3226
##
## $diastolic$name
## [1] "High school graduate"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 38.00 67.00 75.00 75.53 83.00 129.00 3314
##
## $diastolic$name
## [1] "Some college"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.00 67.00 75.00 75.58 83.00 151.00 3350
##
## $diastolic$name
## [1] "College graduate or above"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 67.00 74.00 74.46 81.00 119.00 3298
##
## $diastolic$name
## [1] "Refused"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 70 70 70 70 70 70 3113
##
## $diastolic$name
## [1] "Don't Know"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 59.00 66.50 70.50 69.75 73.75 79.00 3119
#boxplots suggest no significant difference among the means. in order to explore the data further, we will create tables and bar charts to explore proportions based on education levels
#create bar charts to compare proportions
ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education level")

ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education level")

#use the functions created above to create tables based on education level
table_sys_education <-table_sys(education, "education_level")
##
## elevated high normal <NA> Sum
## Less than 9th grade 92 245 181 148 666
## 9-11th grade 152 343 338 114 947
## High school graduate 366 702 790 202 2060
## Some college 519 841 1173 238 2771
## College graduate or above 423 548 930 186 2087
## Refused 0 0 1 1 2
## Don't Know 1 2 1 7 11
## <NA> 212 52 2440 408 3112
## Sum 1765 2733 5854 1304 11656
table_dia_education <-table_dia(education, "education_level")
##
## high normal <NA> Sum
## Less than 9th grade 168 350 148 666
## 9-11th grade 279 554 114 947
## High school graduate 641 1217 202 2060
## Some college 892 1641 238 2771
## College graduate or above 564 1337 186 2087
## Refused 0 1 1 2
## Don't Know 0 4 7 11
## <NA> 105 2599 408 3112
## Sum 2649 7703 1304 11656
#the fifth variable to explore is daily Vitamin D dietary supplement intake. is a high daily Vitamin D consumption associated with low blood pressure?
#extract the identifier and vitamin d columns from Dietary_Supplement_Use data frame
vitamin_d <- data.frame('Respondent sequence number.' = dataset_list[["Dietary_Supplement_Use"]]$`Respondent sequence number.`, 'vitamin d' = dataset_list[["Dietary_Supplement_Use"]]$`Vitamin D (D2 + D3) (mcg)`)
#merge the blood pressure measures and vitamin_d data frames
vitamin_d <- merge(Blood_Pressure_Measures_1, vitamin_d, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(vitamin_d, aes(vitamin.d, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(vitamin_d, aes(vitamin.d, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#scatter plots were not very helpful in showing the relationships between the variables
#use the factor function to reorder the blood pressure levels within the data frame so the box plots will appear in the desired order
order <- c("normal", "elevated", "high")
vitamin_d$sys_blood_pressure_level <- factor(vitamin_d$sys_blood_pressure_level, levels = order)
vitamin_d$dia_blood_pressure_level <- factor(vitamin_d$dia_blood_pressure_level, levels = order)
#use magrittr functions to calculate the means and standard deviations of vitamin.d by systolic and diastolic blood pressure levels
vitamin_d_sys_summary_statistics <- vitamin_d %>%
select('Respondent sequence number.', sys_blood_pressure_level, vitamin.d) %>%
group_by(sys_blood_pressure_level) %>%
summarize(Mean_vidamin_d = mean(vitamin.d, na.rm=TRUE), SD_vitamin_d = sd(vitamin.d, na.rm = TRUE))
print(vitamin_d_sys_summary_statistics)
## # A tibble: 4 × 3
## sys_blood_pressure_level Mean_vidamin_d SD_vitamin_d
## <fct> <dbl> <dbl>
## 1 normal 32.7 60.1
## 2 elevated 51.0 140.
## 3 high 48.5 103.
## 4 <NA> 31.9 37.1
vitamin_d_dia_summary_statistics <- vitamin_d %>%
select('Respondent sequence number.', dia_blood_pressure_level, vitamin.d) %>%
group_by(dia_blood_pressure_level) %>%
summarize(Mean_vidamin_d = mean(vitamin.d, na.rm=TRUE), SD_vitamin_d = sd(vitamin.d, na.rm = TRUE))
print(vitamin_d_dia_summary_statistics)
## # A tibble: 3 × 3
## dia_blood_pressure_level Mean_vidamin_d SD_vitamin_d
## <fct> <dbl> <dbl>
## 1 normal 37.9 85.7
## 2 high 50.5 114.
## 3 <NA> 31.9 37.1
#because of the few large values, boxplots look compressed. zoom in to lower vitamin d values
boxplot(vitamin_d$vitamin.d ~ vitamin_d$sys_blood_pressure_level, ylim = c(0,200))

boxplot(vitamin_d$vitamin.d ~ vitamin_d$dia_blood_pressure_level, ylim = c(0,150))

#the sixth variable to explore is daily dietary fiber intake. is a high daily fiber consumption associated with low blood pressure?
#extract the identifier and dietary fiber columns from Dietary_Supplement_Use data frame
fiber <- data.frame('Respondent sequence number.' = dataset_list[["Dietary_Supplement_Use"]]$`Respondent sequence number.`, 'fiber' = dataset_list[["Dietary_Supplement_Use"]]$`Dietary fiber (gm)`)
#merge the blood pressure measures and fiber data frames
fiber <- merge(Blood_Pressure_Measures_1, fiber, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(fiber, aes(fiber, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(fiber, aes(fiber, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#scatter plots suggests there is no relationship
#use the factor function to reorder the blood pressure levels within the data frame so the box plots will appear in the desired order
order <- c("normal", "elevated", "high")
fiber$sys_blood_pressure_level <- factor(fiber$sys_blood_pressure_level, levels = order)
fiber$dia_blood_pressure_level <- factor(fiber$dia_blood_pressure_level, levels = order)
#use magrittr functions to calculate the means and standard deviations of the variable by systolic and diastolic blood pressure levels
fiber_sys_summary_statistics <- fiber %>%
select('Respondent sequence number.', sys_blood_pressure_level, fiber) %>%
group_by(sys_blood_pressure_level) %>%
summarize(Mean_fiber = mean(fiber, na.rm=TRUE), SD_fiber = sd(fiber, na.rm = TRUE))
print(fiber_sys_summary_statistics)
## # A tibble: 4 × 3
## sys_blood_pressure_level Mean_fiber SD_fiber
## <fct> <dbl> <dbl>
## 1 normal 2.55 3.82
## 2 elevated 2.57 2.42
## 3 high 3.39 4.85
## 4 <NA> 1.66 3.55
fiber_dia_summary_statistics <- fiber %>%
select('Respondent sequence number.', dia_blood_pressure_level, fiber) %>%
group_by(dia_blood_pressure_level) %>%
summarize(Mean_fiber = mean(fiber, na.rm=TRUE), SD_fiber = sd(fiber, na.rm = TRUE))
print(fiber_dia_summary_statistics)
## # A tibble: 3 × 3
## dia_blood_pressure_level Mean_fiber SD_fiber
## <fct> <dbl> <dbl>
## 1 normal 2.91 3.96
## 2 high 2.59 4.09
## 3 <NA> 1.66 3.55
#use boxplots to display distributions of vitamin d consumption by blood pressure levels
boxplot(fiber$fiber ~ fiber$sys_blood_pressure_level)

boxplot(fiber$fiber ~ fiber$dia_blood_pressure_level)

#the seventh variable to explore is Body Mass Index. is a high Body Mass Index associated with high blood pressure?
#extract the identifier and BMI columns from Body_Measures data frame
bmi <- data.frame('Respondent sequence number.' = dataset_list[["Body_Measures"]]$`Respondent sequence number.`, 'bmi' = dataset_list[["Body_Measures"]]$`Body Mass Index (kg/m**2)`)
#merge the blood pressure measures and bmi data frames
bmi <- merge(Blood_Pressure_Measures_1, bmi, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(bmi, aes(bmi, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(bmi, aes(bmi, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#there seems to be a trend, however, not an obvious one. add the gender_description column from the gender data frame and plot again to see if the trend will be clearer
bmi_gender <- merge(bmi, gender, all = TRUE)
#Color the data points by gender_description
ggplot(data = bmi_gender) + geom_point(mapping = aes(x = bmi, y = Systolic, color = gender_description))
## Warning: Removed 1394 rows containing missing values (geom_point).

ggplot(data = bmi_gender) + geom_point(mapping = aes(x = bmi, y = Diastolic, color = gender_description))
## Warning: Removed 1394 rows containing missing values (geom_point).

#try the other categorical variables
#age
bmi_age <- merge(bmi, age, all = TRUE)
#Color the data points by age_group
ggplot(data = bmi_age) + geom_point(mapping = aes(x = bmi, y = Systolic, color = age_group))
## Warning: Removed 1394 rows containing missing values (geom_point).

ggplot(data = bmi_age) + geom_point(mapping = aes(x = bmi, y = Diastolic, color = age_group))
## Warning: Removed 1394 rows containing missing values (geom_point).

#race
bmi_race <- merge(bmi, race, all = TRUE)
#Color the data points by race
ggplot(data = bmi_race) + geom_point(mapping = aes(x = bmi, y = Systolic, color = race_description))
## Warning: Removed 1394 rows containing missing values (geom_point).

ggplot(data = bmi_race) + geom_point(mapping = aes(x = bmi, y = Diastolic, color = race_description))
## Warning: Removed 1394 rows containing missing values (geom_point).

#education
bmi_education <- merge(bmi, education, all = TRUE)
#Color the data points by education level
ggplot(data = bmi_education) + geom_point(mapping = aes(x = bmi, y = Systolic, color = education_level))
## Warning: Removed 1394 rows containing missing values (geom_point).

ggplot(data = bmi_education) + geom_point(mapping = aes(x = bmi, y = Diastolic, color = education_level))
## Warning: Removed 1394 rows containing missing values (geom_point).

#the scatter plots do not show the relationships very well. to investigate further, bmi numbers will be separated into 4 categories. and box plots will be used to compare the distribution of the categories
#create a vector containing 4 bmi categories
description <- c("underweight", "normal weight", "overweight", "obesity")
#create a new vector for bmi category, which will be added to the bmi data frame later
bmi_category <- c()
#empty the placeholder vector called category
category <- c()
#use for and if function to categorize bmi numbers
for (i in seq_along(bmi$bmi)){
if (is.na(bmi$bmi[i])){
category <- NA
}else if (bmi$bmi[i] >= 30){
category <- description[4]
}else if (bmi$bmi[i] >= 25){
category <- description[3]
}else if (bmi$bmi[i] >= 18.5){
category <- description[2]
}else{
category <- description[1]
}
bmi_category <- c(bmi_category, category)
}
#add the bmi_category vector to the bmi data frame
bmi$bmi_category <- bmi_category
#use the factor function to reorder the bmi levels within the data frame so the box plots will appear with an organized order. the order is the same as it appears in the description vector above
bmi$bmi_category <- factor(bmi$bmi_category, levels = description)
#boxplots and summary statistics are used to explore the relationship between education level and blood pressure
boxplot(bmi$Systolic ~ bmi$bmi_category)

boxplot(bmi$Diastolic ~ bmi$bmi_category)

#use the function created above to calculate summary statistics
pressure_measure_summaries_bmi <- create_pressure_measure_summaries(bmi,length(description))
## $systolic
## $systolic$name
## [1] "underweight"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 61.0 96.0 102.0 103.6 109.0 186.0 353
##
## $systolic$name
## [1] "normal weight"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.0 104.0 113.0 116.6 125.0 205.0 565
##
## $systolic$name
## [1] "overweight"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 74.0 109.0 120.0 123.1 133.0 217.0 537
##
## $systolic$name
## [1] "obesity"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 66.0 110.0 121.0 123.8 135.0 224.0 569
##
##
## $diastolic
## $diastolic$name
## [1] "underweight"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.00 55.00 60.00 61.22 65.00 124.00 353
##
## $diastolic$name
## [1] "normal weight"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 60.00 67.00 68.12 75.00 134.00 565
##
## $diastolic$name
## [1] "overweight"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 43.00 65.00 72.00 73.07 80.00 141.00 537
##
## $diastolic$name
## [1] "obesity"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 69.00 76.00 76.87 84.00 151.00 569
#the eighth variable to explore is alcohol consumption frequency. is a high alcohol consumption frequency associated with high blood pressure?
#extract the identifier and drink frequency columns from Alcohol_Use data frame
alcohol <- data.frame('Respondent sequence number.' = dataset_list[["Alcohol_Use"]]$`Respondent sequence number.`, 'drink_frequency' = dataset_list[["Alcohol_Use"]]$`Considering all types of alcoholic beverages, during the past 30 days, how many times did you have {5/4} or more drinks on an occasion?`)
#merge the blood pressure measures and alcohol data frames
alcohol <- merge(Blood_Pressure_Measures_1, alcohol, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#the "777" and "999" numbers in the drink_frequency columns represent "refused to answer" and "don't know". since these answers are not helpful to the analysis, they will be omitted. before that, all the rows containing NA values in drink_frequency column will be removed
alcohol <- alcohol[complete.cases(alcohol[,"drink_frequency"]),]
alcohol <- subset(alcohol, drink_frequency != 777)
alcohol <- subset(alcohol, drink_frequency != 999)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(alcohol, aes(drink_frequency, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(alcohol, aes(drink_frequency, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#explore the interactions between alcohol consumption and categorical variables regarding blood pressure
#gender
alc_gender <- merge(alcohol, gender, all = TRUE)
#Color the data points by gender_description
ggplot(data = alc_gender) + geom_point(mapping = aes(x = drink_frequency, y = Systolic, color = gender_description))
## Warning: Removed 6270 rows containing missing values (geom_point).

ggplot(data = alc_gender) + geom_point(mapping = aes(x = drink_frequency, y = Diastolic, color = gender_description))
## Warning: Removed 6270 rows containing missing values (geom_point).

#use magrittr functions to calculate the means and standard deviations of the interaction
alc_gender_summary_statistics <- alc_gender %>%
select('Respondent sequence number.', drink_frequency, Systolic, Diastolic, gender_description) %>%
group_by(gender_description) %>%
summarize(Mean_alc = mean(drink_frequency, na.rm=TRUE), SD_alc = sd(drink_frequency, na.rm = TRUE),
Mean_sys = mean(Systolic, na.rm=TRUE), SD_sys = sd(Systolic, na.rm = TRUE),
Mean_dia = mean(Diastolic, na.rm=TRUE), SD_dia = sd(Diastolic, na.rm = TRUE))
print(alc_gender_summary_statistics)
## # A tibble: 2 × 7
## gender_description Mean_alc SD_alc Mean_sys SD_sys Mean_dia SD_dia
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female 0.899 3.18 118. 20.8 71.6 12.2
## 2 male 1.73 4.88 122. 18.8 72.5 12.6
#age
alc_age <- merge(alcohol, age, all = TRUE)
#Color the data points by age_group
ggplot(data = alc_age) + geom_point(mapping = aes(x = drink_frequency, y = Systolic, color = age_group))
## Warning: Removed 6270 rows containing missing values (geom_point).

ggplot(data = alc_age) + geom_point(mapping = aes(x = drink_frequency, y = Diastolic, color = age_group))
## Warning: Removed 6270 rows containing missing values (geom_point).

#use magrittr functions to calculate the means and standard deviations of the interaction
alc_age_summary_statistics <- alc_age %>%
select('Respondent sequence number.', drink_frequency, Systolic, Diastolic, age_group) %>%
group_by(age_group) %>%
summarize(Mean_alc = mean(drink_frequency, na.rm=TRUE), SD_alc = sd(drink_frequency, na.rm = TRUE),
Mean_sys = mean(Systolic, na.rm=TRUE), SD_sys = sd(Systolic, na.rm = TRUE),
Mean_dia = mean(Diastolic, na.rm=TRUE), SD_dia = sd(Diastolic, na.rm = TRUE))
print(alc_age_summary_statistics)
## # A tibble: 5 × 7
## age_group Mean_alc SD_alc Mean_sys SD_sys Mean_dia SD_dia
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 children NaN NA 101. 9.89 60.1 8.71
## 2 teens NaN NA 108. 10.4 64.4 8.30
## 3 young adults 1.44 3.87 115. 14.0 73.0 11.4
## 4 middle age 1.54 4.88 127. 19.1 78.3 11.6
## 5 65+ 0.608 3.18 136. 21.8 72.9 12.1
#race
alc_race <- merge(alcohol, race, all = TRUE)
#Color the data points by race
ggplot(data = alc_race) + geom_point(mapping = aes(x = drink_frequency, y = Systolic, color = race_description))
## Warning: Removed 6270 rows containing missing values (geom_point).

ggplot(data = alc_race) + geom_point(mapping = aes(x = drink_frequency, y = Diastolic, color = race_description))
## Warning: Removed 6270 rows containing missing values (geom_point).

#use magrittr functions to calculate the means and standard deviations of the interaction
alc_race_summary_statistics <- alc_race %>%
select('Respondent sequence number.', drink_frequency, Systolic, Diastolic, race_description) %>%
group_by(race_description) %>%
summarize(Mean_alc = mean(drink_frequency, na.rm=TRUE), SD_alc = sd(drink_frequency, na.rm = TRUE),
Mean_sys = mean(Systolic, na.rm=TRUE), SD_sys = sd(Systolic, na.rm = TRUE),
Mean_dia = mean(Diastolic, na.rm=TRUE), SD_dia = sd(Diastolic, na.rm = TRUE))
print(alc_race_summary_statistics)
## # A tibble: 6 × 7
## race_description Mean_alc SD_alc Mean_sys SD_sys Mean_dia SD_dia
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Asian 0.556 2.33 119. 18.8 72.4 11.3
## 2 Black 1.49 4.65 123. 22.0 74.3 13.7
## 3 Mexican American 1.21 3.52 116. 18.6 69.7 11.4
## 4 Other Hispanic 1.13 3.51 120. 20.4 71.6 12.6
## 5 Other Race 2.07 5.68 116. 18.1 70.7 12.7
## 6 White 1.38 4.20 120. 18.8 71.3 11.7
#education
alc_education <- merge(alcohol, education, all = TRUE)
#Color the data points by education level
ggplot(data = alc_education) + geom_point(mapping = aes(x = drink_frequency, y = Systolic, color = education_level))
## Warning: Removed 6270 rows containing missing values (geom_point).

ggplot(data = alc_education) + geom_point(mapping = aes(x = drink_frequency, y = Diastolic, color = education_level))
## Warning: Removed 6270 rows containing missing values (geom_point).

#use magrittr functions to calculate the means and standard deviations of the interaction
alc_education_summary_statistics <- alc_education %>%
select('Respondent sequence number.', drink_frequency, Systolic, Diastolic, education_level) %>%
group_by(education_level) %>%
summarize(Mean_alc = mean(drink_frequency, na.rm=TRUE), SD_alc = sd(drink_frequency, na.rm = TRUE),
Mean_sys = mean(Systolic, na.rm=TRUE), SD_sys = sd(Systolic, na.rm = TRUE),
Mean_dia = mean(Diastolic, na.rm=TRUE), SD_dia = sd(Diastolic, na.rm = TRUE))
print(alc_education_summary_statistics)
## # A tibble: 8 × 7
## education_level Mean_alc SD_alc Mean_sys SD_sys Mean_dia SD_dia
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Less than 9th grade 1.12 3.71 131. 22.2 75.1 12.6
## 2 9-11th grade 2.45 6.39 129. 22.4 75.1 12.4
## 3 High school graduate 1.47 4.14 126. 20.2 75.5 12.2
## 4 Some college 1.33 4.25 124. 19.4 75.6 12.1
## 5 College graduate or above 0.920 3.19 122. 17.9 74.5 11.1
## 6 Refused NaN NA 114 NA 70 NA
## 7 Don't Know 0 0 131. 17.4 69.8 8.30
## 8 <NA> 0.774 1.68 106. 11.0 63.1 8.83
#the ninth variable to explore is whether one is covered by health insurance. do people who have health insurance generally have lower blood pressure?
#extract the identifier and insurance columns from Health_Insurance data frame
health_insurance <- data.frame('Respondent sequence number.' = dataset_list[["Health_Insurance"]]$`Respondent sequence number.`, 'health insurance' = dataset_list[["Health_Insurance"]]$`{Are you/Is SP} covered by health insurance or some other kind of health care plan? [Include health insurance obtained through employment or purchased directly as well as government programs like Medicare and Medicaid that provide medical care or help pay medical bills.]`)
#merge the blood pressure measures and health_insurance data frames
health_insurance <- merge(Blood_Pressure_Measures_1, health_insurance, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#in the health insurance column, 1 represents "yes", and 2 represents "no." add a column to indicate such information
coverd_by_health_insurance <- vector(mode = "character")
#filtered out all the rows that say yes to being covered by health insurance
yes <- health_insurance[health_insurance$health.insurance == 1, ]
#add the coverd_by_health_insurance column to the yes data frame
yes$coverd_by_health_insurance <- rep("yes", length(yes$health.insurance))
#filtered out all the rows that say no to being covered by health insurance
no <- health_insurance[health_insurance$health.insurance == 2, ]
#add the coverd_by_health_insurance column to the no data frame
no$coverd_by_health_insurance <- rep("no", length(no$health.insurance))
#combine the yes and no data frames
health_insurance <- merge(yes, no, all = TRUE)
#boxplots and summary statistics are used to explore the relationship between health insurance and blood pressure
boxplot(health_insurance$Systolic ~ health_insurance$coverd_by_health_insurance)

boxplot(health_insurance$Diastolic ~ health_insurance$coverd_by_health_insurance)

#reuse the systolic_summary_stat and diastolic_summary_stat functions from above
description <- c("yes", "no")
#use the function created above to calculate summary statistics
pressure_measure_summaries_health_insurance <- create_pressure_measure_summaries(health_insurance,length(description))
## $systolic
## $systolic$name
## [1] "yes"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.0 106.0 116.0 119.8 131.0 225.0 1083
##
## $systolic$name
## [1] "no"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 77.0 107.0 118.0 121.1 130.0 220.0 219
##
##
## $diastolic
## $diastolic$name
## [1] "yes"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 31.00 63.00 71.00 71.65 79.00 141.00 1083
##
## $diastolic$name
## [1] "no"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 39.0 65.0 73.0 74.5 82.0 151.0 219
#based on the boxplots, both groups share similar distributions in both kinds of blood pressures. in order to explore the data further, we will create tables and bar charts to explore proportions based on insurance coverage
#create bar charts to compare proportions
ggplot(health_insurance) + geom_bar(mapping = aes(x=coverd_by_health_insurance, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="covered by insurance?")

ggplot(health_insurance) + geom_bar(mapping = aes(x=coverd_by_health_insurance, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="covered by insruance?")

#use the functions created earlier to create count tables based on insurance coverage and by blood pressure levels
table_sys_insurance <-table_sys(health_insurance, "coverd_by_health_insurance")
##
## elevated high normal <NA> Sum
## no 273 358 748 219 1598
## yes 1487 2372 5089 1083 10031
## <NA> 0 0 0 0 0
## Sum 1760 2730 5837 1302 11629
table_dia_insurance <-table_dia(health_insurance, "coverd_by_health_insurance")
##
## high normal <NA> Sum
## no 449 930 219 1598
## yes 2194 6754 1083 10031
## <NA> 0 0 0 0
## Sum 2643 7684 1302 11629
#the tenth variable to explore is income. is high income associated with low blood pressure?
#extract the identifier and poverty level index (a ratio of monthly family income to the HHS poverty guidelines specific to family size) columns from Income data frame
income <- data.frame('Respondent sequence number.' = dataset_list[["Income"]]$`Respondent sequence number.`, 'poverty_level_index' = dataset_list[["Income"]]$`Family monthly poverty level index, a ratio of monthly family income to the HHS poverty guidelines specific to family size.`)
#merge the blood pressure measures and income data frames
income <- merge(Blood_Pressure_Measures_1, income, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(income, aes(poverty_level_index, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(income, aes(poverty_level_index, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#use magrittr functions to calculate the means and standard deviations of the variables
income_summary_statistics <- income %>%
select('Respondent sequence number.', poverty_level_index) %>%
summarize(Mean_in = mean(poverty_level_index, na.rm=TRUE),
SD_inc = sd(poverty_level_index, na.rm = TRUE),
Min_inc = min(poverty_level_index, na.rm = TRUE),
Max_inc = max(poverty_level_index, na.rm = TRUE))
sys_summary_statistics <- income %>%
select('Respondent sequence number.', Systolic) %>%
summarize(Mean_sys = mean(Systolic, na.rm=TRUE),
SD_sys = sd(Systolic, na.rm = TRUE),
Min_sys = min(Systolic, na.rm = TRUE),
Max_sys = max(Systolic, na.rm = TRUE))
dia_summary_statistics <- income %>%
select('Respondent sequence number.', Diastolic) %>%
summarize(Mean_dia = mean(Diastolic, na.rm=TRUE),
SD_dia = sd(Diastolic, na.rm = TRUE),
Min_dia = min(Diastolic, na.rm = TRUE),
Max_dia = max(Diastolic, na.rm = TRUE))
print(income_summary_statistics)
## Mean_in SD_inc Min_inc Max_inc
## 1 2.259799 1.549095 0 5
print(sys_summary_statistics)
## Mean_sys SD_sys Min_sys Max_sys
## 1 120.0046 19.95013 52 225
print(dia_summary_statistics)
## Mean_dia SD_dia Min_dia Max_dia
## 1 72.03574 12.41688 31 151
#the eleventh variable to explore is sleep. is poor sleep associated with high blood pressure?
#extract the identifier and sleep columns from Income data frame
sleep <- data.frame('Respondent sequence number.' = dataset_list[["Sleep_Disorders"]]$`Respondent sequence number.`, 'workday_sleep_hrs' = dataset_list[["Sleep_Disorders"]]$`Number of hours usually sleep on weekdays or workdays.`)
#merge the blood pressure measures and sleep data frames
sleep <- merge(Blood_Pressure_Measures_1, sleep, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#use ggplot to create scatter plots to examine the relationships between the two variables
ggplot(sleep, aes(workday_sleep_hrs, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(sleep, aes(workday_sleep_hrs, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#use magrittr functions to calculate the means and standard deviations of the variable
sleep_summary_statistics <- sleep %>%
select('Respondent sequence number.', workday_sleep_hrs) %>%
summarize(Mean_sleep = mean(workday_sleep_hrs, na.rm=TRUE),
SD_sleep = sd(workday_sleep_hrs, na.rm = TRUE),
Min_sleep = min(workday_sleep_hrs, na.rm = TRUE),
Max_sleep = max(workday_sleep_hrs, na.rm = TRUE))
#the last variable to explore is work schedule. how is one's work schedule related to blood pressure?
#extract the identifier and work schedule columns from Occupation data frame
work_schedule <- data.frame('Respondent sequence number.' = dataset_list[["Occupation"]]$`Respondent sequence number.`, 'work_schedule' = dataset_list[["Occupation"]]$`Which of the following best describes your overall work schedule (include all jobs) for the last three months?`)
#merge the blood pressure measures and work_schedule data frames
work_schedule <- merge(Blood_Pressure_Measures_1, work_schedule, by.x = "Respondent sequence number.", by.y = "Respondent.sequence.number.", all.x = TRUE)
#reuse the description vector to house descriptions for the work_schedule codes
description <- c("nine_to_five", "evening_or_nights", "early_mornings", "variable_or_irregular", "refused", "dont_know")
#create a new vector for work schedule description, which will be added to the work_schedule data frame later
work_schedule_description <- c()
#empty the placeholder vector called category
category <- c()
#use for and if function to match codes with descriptions
for (i in seq_along(work_schedule$work_schedule)){
if (is.na(work_schedule$work_schedule[i])){
category <- "na"
}else if (work_schedule$work_schedule[i] == 1){
category <- description[1]
}else if (work_schedule$work_schedule[i] == 2){
category <- description[2]
}else if (work_schedule$work_schedule[i] == 3){
category <- description[3]
}else if (work_schedule$work_schedule[i] == 5){ #there is no code 4
category <- description[4]
}else if (work_schedule$work_schedule[i] == 7){ #there is no code 6
category <- description[5]
}else{
category <- description[6]
}
work_schedule_description <- c(work_schedule_description, category)
}
#add the work_schedule_description vector to the work_schedule data frame
work_schedule$work_schedule_description <- work_schedule_description
#exclude rows that say "na" in the work_schedule_description column
work_schedule <- work_schedule[work_schedule_description != "na",]
#boxplots and summary statistics are used to explore the relationship between education level and blood pressure
boxplot(work_schedule$Systolic ~ work_schedule$work_schedule_description)

boxplot(work_schedule$Diastolic ~ work_schedule$work_schedule_description)

#use the function created above to calculate summary statistics
pressure_measure_summaries_work_schedule <- create_pressure_measure_summaries(work_schedule,length(description))
## $systolic
## $systolic$name
## [1] "nine_to_five"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 80 110 119 121 130 205 168
##
## $systolic$name
## [1] "evening_or_nights"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 86.0 107.0 117.0 119.1 128.0 210.0 55
##
## $systolic$name
## [1] "early_mornings"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 82.0 111.0 121.0 123.6 133.0 225.0 69
##
## $systolic$name
## [1] "variable_or_irregular"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 76.0 109.0 119.0 121.2 131.0 220.0 175
##
## $systolic$name
## [1] "refused"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 120 120 120 120 120 120 1
##
## $systolic$name
## [1] "dont_know"
##
## $systolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 103 103 103 103 103 103
##
##
## $diastolic
## $diastolic$name
## [1] "nine_to_five"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.0 68.0 75.0 75.6 83.0 134.0 168
##
## $diastolic$name
## [1] "evening_or_nights"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 47.00 65.00 73.00 73.63 81.00 126.00 55
##
## $diastolic$name
## [1] "early_mornings"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 43.0 67.0 75.0 75.5 82.0 127.0 69
##
## $diastolic$name
## [1] "variable_or_irregular"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.00 67.00 74.00 74.93 82.00 151.00 175
##
## $diastolic$name
## [1] "refused"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 78 78 78 78 78 78 1
##
## $diastolic$name
## [1] "dont_know"
##
## $diastolic$summary_statistic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66 66 66 66 66 66
#to explore the variable in a different way. create bar charts to compare proportions
ggplot(work_schedule) + geom_bar(mapping = aes(x=work_schedule_description, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="work schedule?")

ggplot(work_schedule) + geom_bar(mapping = aes(x=work_schedule_description, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="work schedule?")

#use the functions created earlier to create count tables based on insurance coverage and by blood pressure levels
table_sys_ws <-table_sys(work_schedule, "work_schedule_description")
##
## elevated high normal <NA> Sum
## dont_know 0 0 1 0 1
## early_mornings 151 218 336 69 774
## evening_or_nights 115 133 356 55 659
## nine_to_five 393 439 903 168 1903
## refused 1 0 0 1 2
## variable_or_irregular 349 450 871 175 1845
## <NA> 0 0 0 0 0
## Sum 1009 1240 2467 468 5184
table_dia_ws <-table_dia(work_schedule, "work_schedule_description")
##
## high normal <NA> Sum
## dont_know 0 1 0 1
## early_mornings 238 467 69 774
## evening_or_nights 174 430 55 659
## nine_to_five 586 1149 168 1903
## refused 0 1 1 2
## variable_or_irregular 546 1124 175 1845
## <NA> 0 0 0 0
## Sum 1544 3172 468 5184